# -*- coding: utf-8 -*-
#@Time : 2020/9/8 15:58
#@Author : 阳某
#@File : 动物科学技术学院.py
#@Software : PyCharm

import re
from urllib.parse import urljoin

import requests
from lxml import etree
from openpyxl import Workbook
# Workbook (and its active sheet) that receives the scraped rows.
wb = Workbook()
wa = wb.active

# Listing pages to crawl. The original list contained col19553 twice, which
# made the crawler fetch that listing twice and write duplicate rows.
# NOTE(review): if the repeated entry was meant to be a different column id,
# add the correct URL here.
urls = [
    'http://cast1.cau.edu.cn/col/col19127/index.html',
    'http://cast1.cau.edu.cn/col/col19148/index.html',
    'http://cast1.cau.edu.cn/col/col19551/index.html',
    'http://cast1.cau.edu.cn/col/col19553/index.html',
]

# Browser-like User-Agent sent with every request.
headers = {
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
}
# Patterns are compiled once instead of being rebuilt on every iteration.
_RECORD_RE = re.compile("<record>(.*?)</record>", re.S)
_HREF_RE = re.compile("href='(.*?)'", re.S)
_EMAIL_RE = re.compile(
    r"[\w!#$%&'*+/=?^_`{|}~-]+(?:\.[\w!#$%&'*+/=?^_`{|}~-]+)*@(?:[\w](?:[\w-]*[\w])?\.)+[\w](?:[\w-]*[\w])?"
)


def _profile_row(page_text):
    """Return ``[name, email]`` scraped from a profile page, or ``None``.

    The e-mail is the first address-like match anywhere in ``page_text``; the
    name is the first text node of a ``<td class="title">`` cell. ``None`` is
    returned (after printing the same diagnostics the script always printed)
    when the page has no e-mail or the name cell cannot be extracted.
    """
    emails = _EMAIL_RE.findall(page_text)
    print(emails)
    if not emails:
        print('没有邮箱')
        return None
    try:
        tree = etree.HTML(page_text)
        name = tree.xpath('//td[@class="title"]/text()')[0].strip()
    except (AttributeError, IndexError, ValueError, etree.LxmlError):
        # AttributeError: parser returned None; IndexError: no title cell;
        # ValueError / LxmlError: lxml refused or failed to parse the text.
        print("错了")
        return None
    return [name, emails[0]]


try:
    for i, listing_url in enumerate(urls):
        print("正在打印第%s个链接"%i)
        try:
            # A timeout keeps one unresponsive listing from hanging the run.
            respo = requests.get(listing_url, headers=headers, timeout=10)
        except requests.RequestException as exc:
            print("列表页请求失败: %s" % exc)
            continue
        print(respo.status_code)
        respo.encoding = respo.apparent_encoding
        data_moban = _RECORD_RE.findall(respo.text)
        print(data_moban)
        print(len(data_moban))
        for da in data_moban:
            hrefs = _HREF_RE.findall(da)
            if not hrefs:
                # Record without a link: nothing to follow.
                continue
            data = hrefs[0]
            print(data)
            # urljoin leaves absolute links untouched and resolves relative
            # ones against the listing page. Previously absolute links were
            # detected by length and then never fetched at all.
            url = urljoin(listing_url, data)
            try:
                # NOTE(review): verify=False disables TLS certificate checks;
                # kept from the original, confirm it is still required.
                res = requests.get(url, headers=headers, verify=False, timeout=3)
            except requests.RequestException:
                print("超时了")
                continue
            print(res.status_code)
            res.encoding = res.apparent_encoding
            line = _profile_row(res.text)
            if line is not None:
                print(line)
                wa.append(line)
finally:
    # Save whatever was collected even if the crawl is interrupted or fails.
    wb.save('./动物科学技术学院.xlsx')